# -*- coding: utf-8 -*-
"""
author     : LiGe
date       : 2019-02-15 16:03
description: 抓取国家统计局行政代码数据
"""
import time
import requests
from lxml import etree
from mysql import RegionCodeDao

# One HTTP session shared by every page fetch in this module.
session = requests.session()
# Base URL that all page paths below are appended to (the path ends in "2018").
url = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018'

# Request headers passed to every session.get() call in this module.
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/587.18 (KHTML, like Gecko) Chrome/57.0.1208.140 Safari/537.18',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Connection': 'keep-alive',
    'Accept-Encoding': 'gzip, deflate',
    'Cache-Control': 'max-age=0',
}

# Object through which the crawl functions look up already stored codes
# (queryRegionCodeList) and store new ones (insertRegionCode).
regionCodeDao = RegionCodeDao()


# 抓取行政代码数据
def getRegionCode():
    """Crawl the top-level index page and descend into every province on it.

    Fetches ``index.html`` below the module-level ``url``, reads every ``<a>``
    inside the ``tr.provincetr`` rows, stores each one through
    ``regionCodeDao.insertRegionCode`` as ``(code, '0', name)`` and then calls
    ``getCity`` for it. The code is the link target without its extension.

    Entries whose code already appears in
    ``regionCodeDao.queryRegionCodeList('0')`` are skipped, except that the
    last row of that result is never counted as done.
    NOTE(review): presumably the last stored row may belong to an interrupted
    run and is therefore crawled again on restart -- confirm the row order
    returned by the DAO.

    Raises:
        requests.RequestException: if the request fails, times out, or the
            server answers with an HTTP error status.
    """
    # Without a timeout a stalled connection would block the crawl forever.
    response = session.get("%s/%s" % (url, 'index.html'), headers=header, timeout=30)
    # An error page would parse to zero rows and silently end the crawl.
    response.raise_for_status()
    response.encoding = 'gbk'

    provinceCodeList = regionCodeDao.queryRegionCodeList('0')
    # Every stored code except the last one counts as already crawled.
    crawledCodes = {row[0] for row in provinceCodeList[:-1]}

    xpathHtml = etree.HTML(response.text)
    for labelA in xpathHtml.xpath("//tr[@class='provincetr']/td/a"):
        text = labelA.text
        code = labelA.attrib['href'].split('.')[0]
        if code in crawledCodes:
            continue
        print('开始抓取%s ...' % text)
        regionCodeDao.insertRegionCode((code, '0', text))
        getCity(paramCode=code, paramText=text)
    print('抓取完成.')


# 市级
def getCity(paramCode, paramText):
    """Crawl the page of one province and descend into every city on it.

    Fetches ``<url>/<paramCode>.html``, reads the ``<a>`` in the last cell of
    every ``tr.citytr`` row, stores each one through
    ``regionCodeDao.insertRegionCode`` as ``(code, parentCode, name)`` and then
    calls ``getCounty`` for it. Both codes are taken from the link target,
    which is split into ``parent/code``.

    Entries whose code already appears in
    ``regionCodeDao.queryRegionCodeList(paramCode)`` are skipped, except that
    the last row of that result is never counted as done.
    NOTE(review): presumably the last stored row may belong to an interrupted
    run and is therefore crawled again on restart -- confirm the row order
    returned by the DAO.

    Args:
        paramCode: code of the parent entry; used as the page name and as the
            lookup key for already stored children.
        paramText: name of the parent entry; used as the stored name for a
            row whose link text is ``'市辖区'``.

    Raises:
        requests.RequestException: if the request fails, times out, or the
            server answers with an HTTP error status.
    """
    # Without a timeout a stalled connection would block the crawl forever.
    response = session.get('%s/%s.html' % (url, paramCode), headers=header, timeout=30)
    # The parent row is already stored, so an error page that parses to zero
    # rows would leave it without children and nobody would notice.
    response.raise_for_status()
    response.encoding = 'gbk'

    cityCodeList = regionCodeDao.queryRegionCodeList(paramCode)
    # Every stored code except the last one counts as already crawled.
    crawledCodes = {row[0] for row in cityCodeList[:-1]}

    xpathHtml = etree.HTML(response.text)
    for labelA in xpathHtml.xpath("//tr[@class='citytr']/td[last()]/a"):
        codeArray = labelA.attrib['href'].split('.')[0].split('/')
        # A row labelled '市辖区' is stored under the parent's own name.
        text = paramText if labelA.text == '市辖区' else labelA.text
        parentCode = codeArray[0]
        code = codeArray[1]
        if code in crawledCodes:
            continue
        print('开始抓取%s ...' % text)
        regionCodeDao.insertRegionCode((code, parentCode, text))
        getCounty(paramCode=code, paramParentCode=paramCode)
        # time.sleep(1)


# 县/区
def getCounty(paramCode, paramParentCode):
    """Crawl the page of one city and descend into every county on it.

    Fetches ``<url>/<paramParentCode>/<paramCode>.html``, reads the ``<a>`` in
    the last cell of every ``tr.countytr`` row, stores each one through
    ``regionCodeDao.insertRegionCode`` as ``(code, paramCode, name)`` and then
    calls ``getTown`` for it. The code is the second path segment of the link
    target. Rows without a link are not matched by the XPath and are ignored.

    Entries whose code already appears in
    ``regionCodeDao.queryRegionCodeList(paramCode)`` are skipped, except that
    the last row of that result is never counted as done.
    NOTE(review): presumably the last stored row may belong to an interrupted
    run and is therefore crawled again on restart -- confirm the row order
    returned by the DAO.

    Args:
        paramCode: code of the entry whose page is fetched; also stored as the
            parent code of every row found.
        paramParentCode: code of the entry one level up; used as the directory
            part of the page URL.

    Raises:
        requests.RequestException: if the request fails, times out, or the
            server answers with an HTTP error status.
    """
    pageUrl = '%s/%s/%s.html' % (url, paramParentCode, paramCode)
    # Without a timeout a stalled connection would block the crawl forever.
    response = session.get(pageUrl, headers=header, timeout=30)
    # The parent row is already stored, so an error page that parses to zero
    # rows would leave it without children and nobody would notice.
    response.raise_for_status()
    response.encoding = 'gbk'

    countyCodeList = regionCodeDao.queryRegionCodeList(paramCode)
    # Every stored code except the last one counts as already crawled.
    crawledCodes = {row[0] for row in countyCodeList[:-1]}

    xpathHtml = etree.HTML(response.text)
    for labelA in xpathHtml.xpath("//tr[@class='countytr']/td[last()]/a"):
        codeArray = labelA.attrib['href'].split('.')[0].split('/')
        text = labelA.text
        code = codeArray[1]
        if code in crawledCodes:
            continue
        print('开始抓取%s ...' % text)
        regionCodeDao.insertRegionCode((code, paramCode, text))
        getTown(paramCode=code, paramParentCode=paramCode)


# 镇/乡
def getTown(paramCode, paramParentCode):
    """Crawl the page of one county and store every town listed on it.

    The page URL is ``<url>/<paramParentCode[:2]>/<paramParentCode[2:]>/
    <paramCode>.html``: the parent code is split after its first two
    characters into two directory segments. The ``<a>`` in the last cell of
    every ``tr.towntr`` row is stored through
    ``regionCodeDao.insertRegionCode`` as ``(code, paramCode, name)``, where
    the code is the second path segment of the link target.

    Unlike the higher levels, no check against already stored rows is made,
    so every row on the page is inserted on every call.

    Args:
        paramCode: code of the entry whose page is fetched; also stored as the
            parent code of every row found.
        paramParentCode: code of the entry one level up; only used to build
            the directory part of the page URL.

    Raises:
        requests.RequestException: if the request fails, times out, or the
            server answers with an HTTP error status.
    """
    pageUrl = '%s/%s/%s/%s.html' % (url, paramParentCode[:2], paramParentCode[2:], paramCode)
    # Without a timeout a stalled connection would block the crawl forever.
    response = session.get(pageUrl, headers=header, timeout=30)
    # The parent row is already stored, so an error page that parses to zero
    # rows would leave it without children and nobody would notice.
    response.raise_for_status()
    response.encoding = 'gbk'

    xpathHtml = etree.HTML(response.text)
    for labelA in xpathHtml.xpath("//tr[@class='towntr']/td[last()]/a"):
        codeArray = labelA.attrib['href'].split('.')[0].split('/')
        regionCodeDao.insertRegionCode((codeArray[1], paramCode, labelA.text))


# Start the full crawl only when this file is executed as a script.
if __name__ == "__main__":
    getRegionCode()
